******************************************************************************************
* ONLINE APPENDIX FIGURE A9 
* 
* WHERE ARE THE IMMIGRANTS FROM VENEZUELAN CONCENTRATED IN NATIVES' WAGE DISTRIBUTION? 
* THE IDEA HERE IS TO GET SOMETHING SIMILAR TO FIGURE 1 IN DUSTANN, FRATTINI, AND PRESTON (2013)
* FOR THE CASE OF THE VENEZULANS IN COLOMBIA
* 
* LAST MODIFIED: JULY, 2025
* 
* For consistency with Dustmann etal 2013, The following sample choices are made:
* 1) we restrict the empirical analysis to individuals aged from 15 to 64, 
* 2) who are in the labor force (i.e., employed or unemployed) and not enrolled at school. 
* 3) I also exclude the self- employed (farmers and entrepreneurs) 
* since the income-setting mechanism in these occupations should differ 
* from the wage-setting of all other workers. 
* I systematically use weights computed by the DANE when aggregating the data. 
******************************************************************************************


clear all 

* INSTALL SCHEMES FOR GRAPHS
*ssc install blindschemes, replace all 
*ssc install schemepack, replace /*based on https://github.com/asjadnaqvi/Stata-schemes*/

**# DIRECTORIES

* PAPER DIRECTORY DROPBOX
global fiscal "/Users/andres/Library/CloudStorage/Dropbox/2 Papers/2023/Paper_Fiscal_Costs"

* SUBDIRECTORIES
global data "$fiscal/1 Data/12 Final Datasets"
global programs "$fiscal/2 Programs/2 Figures and Tables paper"
global results "$fiscal/3 Results"

**# LOAD DATASET AND CREATE NEW VARIABLES

* LOAD DATASET WITH VARIABLES NEEDED
use "$data/LaborEffects.dta", clear

**# SAMPLE SELECTION 

* KEEP ONLY WORKING-AGE POPULATION
drop if (age<15|age>64) 

* KEEP ACTIVE IN THE LABOR FORCE
keep if labforce == 1 

* KEEP IF NOT ENROLLED IN SCHOOL
keep if att_school == 2 

* KEEP IF NOT SELF-EMPLOYED
*keep if p6430 != 4

* LOCATED IN METROPOLITAN AREAS
keep if area !=. 

* DROP ALL OTHER FOREIGN BORN
drop if immig  == 5 

* DISPLAY FORMAT FOR SAMPLING WEIGHTS 
format %12.0f fex12

**# CREATE ADDITIONAL VARIABLES

* CREATE EDUCATION/SKILL VARIABLES 
tab skill_group, gen(skill_group) /*THREE SKILL/EDUCATION CATEGORIES*/

* GENERATE INTERACTION BETWEEN EDUCATION AND AGE CATEGORIES
* THREE EDUCATION/SKILL CATEGORES X 5 AGE GROUPS
egen skill_groupXAgeGroups = group(skill_group AgeGroups)
tab skill_groupXAgeGroups, gen(skill_groupXAgeGroups)

* AGE2
gen ageSQ = age^2  

* GENERATE QUARTER DUMMIES 
gen qdate = qofd(mdy(month, 1, year)) 
format qdate %tq
tab qdate
gen qdate1 = .
gen qdate2 = . 
gen qdate3 = .
gen qdate4 = .
local year "2013 2014 2015 2016 2017 2018 2019"
foreach j of local year {
tab qdate if year == `j', gen(qdate_) 
replace qdate1 = qdate_1 if year == `j'
replace qdate2 = qdate_2 if year == `j'
replace qdate3 = qdate_3 if year == `j'
replace qdate4 = qdate_4 if year == `j'
drop qdate_1 qdate_2 qdate_3 qdate_4 
}

* GENERATE DUMMIES FOR BOGOTA, MEDELLIN, AND CALI
tab area 
gen bogota=(area==11)
gen medellin=(area==5)
gen cali=(area==76)

**# HANDLING OUTLIERS
local year "2013 2014 2015 2016 2017 2018 2019"
foreach j of local year {
display "this is year `j'"
centile hourly_real_wages if year == `j', cent(0.5 99.5)
return list 
local cent1 = r(c_1)
display `cent1'
local cent99 = r(c_2)
display `cent99'
* keep only de wage distribution between 1 and 99
drop if hourly_real_wages > `cent99'  
drop if hourly_real_wages < `cent1' 
}

* GEN ID
gen id = _n

* KEEP ONLY 2014-2018 
drop if year == 2013
drop if year == 2019

* KEEP VARIABLES TO RUN REGRESSIONS
keep id fex12 mthly_real_wages hourly_real_wages log_hourly_real_wages ///
age ageSQ sex qdate* bogota medellin cali AgeGroups* skill_group skill_group* skill_groupXAgeGroups* year immig
drop qdate 
drop AgeGroups1 AgeGroups5


* TO TEST THE REGRESSIONS 
*regress log_hourly_real_wages  qdate2-qdate4 bogota medellin cali AgeGroups3-AgeGroups4 skill_group2-skill_group3 skill_groupXAgeGroups2-skill_groupXAgeGroups9 if sex==1 & year == 2016 & immig == 0 [pw = fex12], robust

*regress log_hourly_real_wages  qdate2-qdate4 bogota medellin cali i.AgeGroups##i.skill_group if sex==1 & year == 2016 & immig == 0 [pw = fex12], robust

*regress log_hourly_real_wages  qdate2-qdate4 bogota medellin cali c.age#c.age c.age##i.skill_group if sex==1 & year == 2016 & immig == 0 [pw = fex12], robust


* RUN WAGE REGRESSIONS AND GET WAGE PREDICTIONS FOR NATIVES FOR EACH YEAR AND SEX
gen sigma_sqaux=.
gen resid=.

quietly sum year
local ymin=r(min)
local ymax=r(max)
forvalues i = `ymin'(1)`ymax' {
	forvalues k = 1/2 {
	display "This is year `i' and sex `k'"
        *capture {
		regress log_hourly_real_wages  qdate2-qdate4 bogota medellin cali i.AgeGroups##i.skill_group if sex == `k' & year == `i' & immig == 0 [pw = fex12], robust
        predict pldhw_`k'_`i' if year==`i' & sex==`k'
		predict sigma if e(sample), resid
		replace sigma_sqaux=sigma^2 if year==`i' & sex==`k'
        drop sigma
		*}
	}
}

forvalues i=`ymin'(1)`ymax' {
	forvalues k=1/2 {
	capture {
                if `i'==`ymin' &`k'==1 {
		    gen lnPredWage = pldhw_`k'_`i'
		    }
                else {
		    replace lnPredWage= pldhw_`k'_`i' if year==`i' & sex==`k'
                drop pldhw_`k'_`i' 
				}
		   }
     }
}

*drop pldhw_1_2013

* ADD ERROR TERM TO THE PREDICTION
quietly sum AgeGroups 
local agemin=r(min)
local agemax=r(max)

quietly sum skill_group
local edumin=r(min)
local edumax=r(max)

sort id
gen lnPredWageX=.
set seed 19811978
forvalues i=`agemin'/`agemax' {
	forvalues k=1/2 {
		forvalues j=`edumin'/`edumax' {
			sum sigma_sqaux if AgeGroups==`i' & sex==`k' & skill_group==`j'
		    matrix S = sqrt(r(mean))
      		drawnorm X, means(0) sds(S)
      		replace lnPredWageX=(lnPredWage + X) if AgeGroups==`i' & sex==`k' & skill_group==`j'
			drop X
			}
		}
	}

drop if lnPredWageX==.


* RENAME TO FACILITATE ANALYSIS 
rename log_hourly_real_wages lnw

* ORDER VARIABLES TO FACILITATE ANALYSIS
order lnw, before(lnPredWage)

* TAG NATIVES AND ALL VENEZUELANS
gen x = . 
replace x = 0 if immig >= 1  /* all migrants from venezuela: 2 = repatriates; 3 = Venezuelans short term; 4 = Venezuelans long term*/
replace x = 1 if immig == 0 /*natives*/
tab x

* ACTUAL POSITION OF VENEZUELANS AND NATIVES 
sort year lnw
by year: gen rank = sum(x) /*rank natives wages*/
by year: egen totwage = sum(x) /*count total natives*/
gen natbel = rank / totwage /*get natives position*/

* PREDICTED POSITION + X OF VENEZULANS AND NATIVES
replace lnPredWageX = lnw if immig == 0 /* use actual instead of predicted wages for natives */
sort year lnPredWageX
by year: gen rank_pred = sum(x)
gen natbel_pred = rank_pred / totwage

* PREDICTED POSITION OF VENEZEULANS AND NATIVES
replace lnPredWage = lnw if immig == 0 // use actual instead of predicted wages for natives
sort year lnPredWage 
by year: gen rank_predNoX=sum(x)
gen natbel_predNoX= rank_predNoX/ totwage

* PREPARE THE LOG ODD RATIO (SEE FOOTNOTE 17 ON DUSTMANN, FRATTINI, AND PRESTON, 2013)
gen immpos=log(natbel/(1-natbel))
gen immpos_pred=log(natbel_pred/(1-natbel_pred))
gen immpos_predNoX=log(natbel_predNoX/(1-natbel_predNoX))

* GENERATE THE VALUES AT WHICH THE DENSITY OF VENEZUELANS SHOULD BE ESTIMATED
gen percentile=_n
replace percentile=. if percentile>=100
gen pctile=percentile/100
gen pctiletrans=log(pctile/(1-pctile))


***************************
/*ESTIMATION ACTUAL WAGES*/
***************************
* ALL MIGRANTS FROM VENEZUELA (REPATRIATES + VENEZUELANS + VENEZUELANS +5)
kdensity immpos if immig >= 1, generate (perc_imm dens_imm)  at(pctiletrans)
* REPATRIATES
kdensity immpos if immig == 1 , generate (perc_immclass1 dens_immclass1)  at(pctiletrans)
* VENEZUELANS SHORT TERM (< 1 YEAR)
kdensity immpos if immig == 2 , generate (perc_immclass2 dens_immclass2)  at(pctiletrans)
* VENEZUELANS 1-5 YEARS
kdensity immpos if immig == 3 , generate (perc_immclass3 dens_immclass3)  at(pctiletrans)
* VENEZUELANS >= 5YEARS
kdensity immpos if immig == 4 , generate (perc_immclass4 dens_immclass4)  at(pctiletrans)


*******************************
/*ESTIMATION PREDICTED WAGES*/
******************************
* ALL MIGRANTS FROM VENEZUELA (REPATRIATES + VENEZUELANS + VENEZUELANS +5)
kdensity immpos_pred if immig >= 1 , generate (perc_imm_pred dens_imm_pred) at(pctiletrans)
* REPATRIATES
kdensity immpos_pred if immig == 1 , generate (perc_immclass1_pred dens_immclass1_pred)  at(pctiletrans)
* VENEZUELANS SHORT TERM (< 1 YEAR)
kdensity immpos_pred if immig == 2, generate (perc_immclass2_pred dens_immclass2_pred) at(pctiletrans)
* VENEZUELANS 1-5 YEARS
kdensity immpos_pred if immig == 3, generate (perc_immclass3_pred dens_immclass3_pred) at(pctiletrans)
* VENEZUELANS >= 5YEARS
kdensity immpos_pred if immig == 4, generate (perc_immclass4_pred dens_immclass4_pred) at(pctiletrans)


*************************************
/*ESTIMATION PREDICTED WAGES NOT X*/
*************************************
* ALL MIGRANTS FROM VENEZUELA (REPATRIATES + VENEZUELANS + VENEZUELANS +5)
kdensity immpos_predNoX if immig >= 1 , generate (perc_imm_predNoX dens_imm_predNoX)  at(pctiletrans)
* REPATRIATES
kdensity immpos_predNoX if immig == 1, generate (perc_immclass1_predNoX dens_immclass1_predNoX)  at(pctiletrans)
* VENEZUELANS SHORT TERM (< 1 YEAR)
kdensity immpos_predNoX if immig == 2, generate (perc_immclass2_predNoX dens_immclass2_predNoX)  at(pctiletrans)
* VENEZUELANS 1-5 YEARS
kdensity immpos_predNoX if immig == 3, generate (perc_immclass3_predNoX dens_immclass3_predNoX)  at(pctiletrans)
* VENEZUELANS >= 5YEARS
kdensity immpos_pred if immig == 4, generate (perc_immclass4_predNoX dens_immclass4_predNoX)  at(pctiletrans)

* APPLY TRANSFORMATION TO THE DENSITY ESTIMATES FOR ACTUAL WAGES
gen density_imm=dens_imm/(pctile*(1-pctile))
gen density_immclass1=dens_immclass1/(pctile*(1-pctile))
gen density_immclass2=dens_immclass2/(pctile*(1-pctile))
gen density_immclass3=dens_immclass3/(pctile*(1-pctile))
gen density_immclass4=dens_immclass4/(pctile*(1-pctile))

* APPLY TRANSFORMATION TO THE DENSITY ESTIMATES FOR PREDICTED WAGES
gen density_imm_pred=dens_imm_pred/(pctile*(1-pctile))
gen density_immclass1_pred=dens_immclass1_pred/(pctile*(1-pctile))
gen density_immclass2_pred=dens_immclass2_pred/(pctile*(1-pctile))
gen density_immclass3_pred=dens_immclass3_pred/(pctile*(1-pctile))
gen density_immclass4_pred=dens_immclass4_pred/(pctile*(1-pctile))

* APPLY TRANSFORMATION TO THE DENSITY ESTIMATES FOR PREDICTED WAGES WITHOUT ADDING THE X
gen density_imm_predNoX=dens_imm_predNoX/(pctile*(1-pctile))
gen density_immclass1_predNoX=dens_immclass1_predNoX/(pctile*(1-pctile))
gen density_immclass2_predNoX=dens_immclass2_predNoX/(pctile*(1-pctile))
gen density_immclass3_predNoX=dens_immclass3_predNoX/(pctile*(1-pctile))
gen density_immclass4_predNoX=dens_immclass4_predNoX/(pctile*(1-pctile))

*DEFINE LABELS
label var percentile "Percentile of non-immigrant wage distribution"

label var density_imm "All migrants from Venezuela"
label var density_immclass1 "Returnees"
label var density_immclass2 "Venezuelans short-term (arrived < 1 year ago)"
label var density_immclass3 "Venezuelans arrived 1-5 years ago"
label var density_immclass4 "Venezuelans +5 years"

label var density_imm_pred "All migrants from Venezuela predicted"
label var density_immclass1_pred "Returnees predicted"
label var density_immclass2_pred "Venezuelans short-term (arrived < 1 year ago) predicted"
label var density_immclass3_pred "Venezuelans arrived 1-5 years ago predicted"
label var density_immclass4_pred "Venezuelans +5 years predicted"

label var density_imm_predNoX "All migrants from Venezuela not X"
label var density_immclass1_predNoX "Returnees not X"
label var density_immclass2_predNoX "Venezuelans short-term (arrived < 1 year ago) not X"
label var density_immclass3_predNoX "Venezuelans arrived 1-5 years ago not X"
label var density_immclass4_predNoX "Venezuelans +5 years not X"

* GENERATE VARIABLE TO PLOT NATIVES PERCENTILES
gen one = 1 
label var one "Non-immigrant"

*******************
* PLOT GRAPH
*******************

/*Actual vs Predicted: ALL MIGRANTS FROM VENEZUELAN: REPATRIATES, VENEZUELANS SHORT-TERM, AND VENEZUELANS +5*/
twoway ///
(line density_imm percentile , sort lpattern(longdash) lcolor(green)) ///	
(line density_imm_pred percentile , sort lpattern(line) lcolor(red)) ///
(line density_immclass2 percentile , sort lpattern(longdash) lcolor(blue)) ///	
(line one percentile, sort lcolor(gray)) ///
if percentile>4.5 & percentile<95.5, graphregion(color(white)) title("") ///
ytitle("Density") xtitle("Percentile of nonimmigrant wage distribution") scheme(sj)   ///
legend(region(lstyle(none) color(white)) label(1 "All migrants from Venezuela") label(2 "All migrants from Venezuela predicted") label(3 "Venezuelan-born immigrants (arrived 12-months ago)") label(4 "Non-immigrants") rowgap(0) colgap(0) cols(1) size(small)) 
* SAVE GRAPH
qui graph save "$results/1 Figures Paper/FigA9_1.gph", replace
qui graph export "$results/1 Figures Paper/FigA9_1.eps", as(eps) replace


